{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "56ea28e3-c67e-4177-a6f3-39395c221e3e",
   "metadata": {},
   "source": [
    "## A03 Move Data to the Exploitation Zone\n",
    "\n",
    "This code will perform data processing and cleaning"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e0228301-8b03-470f-82a1-05a8989a0d03",
   "metadata": {},
   "source": [
    "### 1. init spark session"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "c048f9f9-e40c-466a-87db-ceb6c148601a",
   "metadata": {},
   "outputs": [],
   "source": [
    "import findspark\n",
    "findspark.init()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "c16a71dc-69df-40ba-b06a-f66d4d319fbb",
   "metadata": {},
   "outputs": [],
   "source": [
    "from pyspark.sql import SparkSession\n",
    "spark = SparkSession.builder.appName(\"income_data\").getOrCreate()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f63fe62e-8424-46ea-9e7b-0a0520157f58",
   "metadata": {},
   "source": [
    "### 2. read the parquet file"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "1ef8ec7a-d7b8-43ac-9af5-0b4f5ca0d659",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+----+--------------+--------------+----------+--------------------+--------+-------------------------+\n",
      "| Any|Codi_Districte| Nom_Districte|Codi_Barri|           Nom_Barri|Població|Índex RFD Barcelona = 100|\n",
      "+----+--------------+--------------+----------+--------------------+--------+-------------------------+\n",
      "|2007|             1|  Ciutat Vella|         1|            el Raval|   46595|                     64.7|\n",
      "|2007|             1|  Ciutat Vella|         2|      el Barri Gòtic|   27946|                     86.5|\n",
      "|2007|             1|  Ciutat Vella|         3|      la Barceloneta|   15921|                     66.7|\n",
      "|2007|             1|  Ciutat Vella|         4|Sant Pere, Santa ...|   22572|                     80.2|\n",
      "|2007|             6|        Gràcia|        28|Vallcarca i els P...|   15381|                    113.2|\n",
      "|2007|             6|        Gràcia|        29|             el Coll|    7190|                     91.7|\n",
      "|2007|             6|        Gràcia|        30|            la Salut|   13072|                    113.0|\n",
      "|2007|             6|        Gràcia|        31|   la Vila de Gràcia|   50409|                    101.9|\n",
      "|2007|             6|        Gràcia|        32|el Camp d'en Gras...|   34535|                    104.3|\n",
      "|2007|             3|Sants-Montjuïc|        11|        el Poble Sec|   39579|                     73.3|\n",
      "+----+--------------+--------------+----------+--------------------+--------+-------------------------+\n",
      "only showing top 10 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "file_path = \"./Formatted Zone/income_data.parquet\"\n",
    "\n",
    "df = spark.read.parquet(file_path)\n",
    "\n",
    "df.show(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "cd3c7c04-9a13-44d4-a2cd-3cb8d57666e8",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "root\n",
      " |-- Any: integer (nullable = true)\n",
      " |-- Codi_Districte: integer (nullable = true)\n",
      " |-- Nom_Districte: string (nullable = true)\n",
      " |-- Codi_Barri: integer (nullable = true)\n",
      " |-- Nom_Barri: string (nullable = true)\n",
      " |-- Població: integer (nullable = true)\n",
      " |-- Índex RFD Barcelona = 100: string (nullable = true)\n",
      "\n"
     ]
    }
   ],
   "source": [
    "df.printSchema()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f138063e-d834-4216-8a65-53263fc6ba3f",
   "metadata": {},
   "source": [
    "### 3. rename the Índex RFD Barcelona column\n",
    "\n",
    "this column name is too complicated"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "d44e1708-4aed-4fe3-959a-135532b1787b",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+----+--------------+-------------+----------+--------------------+--------+-------------------+\n",
      "| Any|Codi_Districte|Nom_Districte|Codi_Barri|           Nom_Barri|Població|Index RFD Barcelona|\n",
      "+----+--------------+-------------+----------+--------------------+--------+-------------------+\n",
      "|2007|             1| Ciutat Vella|         1|            el Raval|   46595|               64.7|\n",
      "|2007|             1| Ciutat Vella|         2|      el Barri Gòtic|   27946|               86.5|\n",
      "|2007|             1| Ciutat Vella|         3|      la Barceloneta|   15921|               66.7|\n",
      "|2007|             1| Ciutat Vella|         4|Sant Pere, Santa ...|   22572|               80.2|\n",
      "|2007|             6|       Gràcia|        28|Vallcarca i els P...|   15381|              113.2|\n",
      "+----+--------------+-------------+----------+--------------------+--------+-------------------+\n",
      "only showing top 5 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "df = df.withColumnRenamed(\"Índex RFD Barcelona = 100\", \"Index RFD Barcelona\")\n",
    "df.show(5)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a1613d0b-48e9-4af6-adb2-67c59f379228",
   "metadata": {},
   "source": [
    "### 4. cast column Index RFD Barcelona to float type"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "6cfd8d0a-91a6-49ba-89f2-baadff435f14",
   "metadata": {},
   "outputs": [],
   "source": [
    "from pyspark.sql.functions import cast, col"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "6d156ad0-cdc1-4938-9e74-30a496b4c89a",
   "metadata": {},
   "outputs": [],
   "source": [
    "df = df.withColumn(\"Index RFD Barcelona\", col(\"Index RFD Barcelona\").cast(\"float\").alias(\"Index RFD Barcelona\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "582d2d12-2fbe-4b43-b12f-7bbf8139c814",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+----+--------------+-------------+----------+--------------------+--------+-------------------+\n",
      "| Any|Codi_Districte|Nom_Districte|Codi_Barri|           Nom_Barri|Població|Index RFD Barcelona|\n",
      "+----+--------------+-------------+----------+--------------------+--------+-------------------+\n",
      "|2007|             1| Ciutat Vella|         1|            el Raval|   46595|               64.7|\n",
      "|2007|             1| Ciutat Vella|         2|      el Barri Gòtic|   27946|               86.5|\n",
      "|2007|             1| Ciutat Vella|         3|      la Barceloneta|   15921|               66.7|\n",
      "|2007|             1| Ciutat Vella|         4|Sant Pere, Santa ...|   22572|               80.2|\n",
      "|2007|             6|       Gràcia|        28|Vallcarca i els P...|   15381|              113.2|\n",
      "+----+--------------+-------------+----------+--------------------+--------+-------------------+\n",
      "only showing top 5 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "df.show(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "6261b625-f9ac-44ca-905d-d6ac69a231ae",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "root\n",
      " |-- Any: integer (nullable = true)\n",
      " |-- Codi_Districte: integer (nullable = true)\n",
      " |-- Nom_Districte: string (nullable = true)\n",
      " |-- Codi_Barri: integer (nullable = true)\n",
      " |-- Nom_Barri: string (nullable = true)\n",
      " |-- Població: integer (nullable = true)\n",
      " |-- Index RFD Barcelona: float (nullable = true)\n",
      "\n"
     ]
    }
   ],
   "source": [
    "df.printSchema()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8d63b307-4650-4690-967e-dd953694d427",
   "metadata": {},
   "source": [
    "### 5. check the null fields"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "82cbf526-d3c6-43fb-98c4-98afaf5c4da7",
   "metadata": {},
   "outputs": [],
   "source": [
    "from pyspark.sql.functions import when, col, sum"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "f69ce964-d7e4-42c3-9838-4e9bc3397c30",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+---+--------------+-------------+----------+---------+--------+-------------------+\n",
      "|Any|Codi_Districte|Nom_Districte|Codi_Barri|Nom_Barri|Població|Index RFD Barcelona|\n",
      "+---+--------------+-------------+----------+---------+--------+-------------------+\n",
      "|  0|             0|            0|         0|        0|       0|                  8|\n",
      "+---+--------------+-------------+----------+---------+--------+-------------------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "missing_values_df = df.select(\n",
    "    [sum(when(col(column).isNull(), 1).otherwise(0)).alias(column) for column in df.columns]\n",
    ")\n",
    "\n",
    "missing_values_df.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "fc698566-cf0a-4fac-9158-2dfa68158a86",
   "metadata": {},
   "source": [
    "### 6. drop the column with missing values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "746354a8-12bd-4a49-b457-d9d20e0d92dd",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+----+--------------+-------------+----------+--------------------+--------+-------------------+\n",
      "| Any|Codi_Districte|Nom_Districte|Codi_Barri|           Nom_Barri|Població|Index RFD Barcelona|\n",
      "+----+--------------+-------------+----------+--------------------+--------+-------------------+\n",
      "|2007|             1| Ciutat Vella|         1|            el Raval|   46595|               64.7|\n",
      "|2007|             1| Ciutat Vella|         2|      el Barri Gòtic|   27946|               86.5|\n",
      "|2007|             1| Ciutat Vella|         3|      la Barceloneta|   15921|               66.7|\n",
      "|2007|             1| Ciutat Vella|         4|Sant Pere, Santa ...|   22572|               80.2|\n",
      "|2007|             6|       Gràcia|        28|Vallcarca i els P...|   15381|              113.2|\n",
      "+----+--------------+-------------+----------+--------------------+--------+-------------------+\n",
      "only showing top 5 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "df = df.na.drop()\n",
    "df.show(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "0b1b165f-94bd-495b-93fe-4715e527113e",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+---+--------------+-------------+----------+---------+--------+-------------------+\n",
      "|Any|Codi_Districte|Nom_Districte|Codi_Barri|Nom_Barri|Població|Index RFD Barcelona|\n",
      "+---+--------------+-------------+----------+---------+--------+-------------------+\n",
      "|  0|             0|            0|         0|        0|       0|                  0|\n",
      "+---+--------------+-------------+----------+---------+--------+-------------------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "missing_values_df = df.select(\n",
    "    [sum(when(col(column).isNull(), 1).otherwise(0)).alias(column) for column in df.columns]\n",
    ")\n",
    "\n",
    "missing_values_df.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7afc5c63-6e8c-4a2c-9b6f-dc6e5546269a",
   "metadata": {},
   "source": [
    "### 7. save to parquet file and csv file"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "6fac4817-5259-47fd-9cf0-e0fa6a7ab96b",
   "metadata": {},
   "outputs": [],
   "source": [
    "output_path = \"./Exploitation Zone/income_data.parquet\"\n",
    "\n",
    "df.write.mode(\"overwrite\").parquet(output_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "6293c3fd-68c7-43ad-a1fe-550ecb9c27ac",
   "metadata": {},
   "outputs": [],
   "source": [
    "output_path = \"./Exploitation Zone/income_data.csv\"\n",
    "\n",
    "df.write.mode(\"overwrite\").csv(output_path, header=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cdb90bfa-023e-44a8-b754-120a88d88be0",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.19"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
